In [30]:
#Resources

We're going to do sum EDA¶

  • a quick look at age and ediucation level dist
  • Try to comapre answeres by different roles (data scientist, data analyst,...)
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import os
In [2]:
df = pd.read_csv('kaggle_survey_2021_responses.csv')
df.shape
C:\Users\Thinkpad\AppData\Local\Temp\ipykernel_14620\98025904.py:1: DtypeWarning: Columns (0,195,201,285,286,287,288,289,290,291,292) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv('kaggle_survey_2021_responses.csv')
Out[2]:
(25974, 369)
In [3]:
df.describe()
Out[3]:
Time from Start to Finish (seconds) Q1 Q2 Q3 Q4 Q5 Q6 Q7_Part_1 Q7_Part_2 Q7_Part_3 ... Q38_B_Part_3 Q38_B_Part_4 Q38_B_Part_5 Q38_B_Part_6 Q38_B_Part_7 Q38_B_Part_8 Q38_B_Part_9 Q38_B_Part_10 Q38_B_Part_11 Q38_B_OTHER
count 25974 25974 25974 25974 25974 25974 25974 21861 5335 10757 ... 634 592 4240 730 738 1021 667 2748 4543 378
unique 5410 12 6 67 8 16 8 2 2 2 ... 2 2 2 2 2 2 2 2 2 2
top 484 25-29 Man India Master’s degree Student 1-3 years Python R SQL ... Comet.ml Sacred + Omniboard TensorBoard Guild.ai Polyaxon ClearML Domino Model Monitor MLflow None Other
freq 42 4931 20598 7434 10132 6804 7874 21860 5334 10756 ... 633 591 4239 729 737 1020 666 2747 4542 377

4 rows × 369 columns

In [4]:
df.dtypes
Out[4]:
Time from Start to Finish (seconds)    object
Q1                                     object
Q2                                     object
Q3                                     object
Q4                                     object
                                        ...  
Q38_B_Part_8                           object
Q38_B_Part_9                           object
Q38_B_Part_10                          object
Q38_B_Part_11                          object
Q38_B_OTHER                            object
Length: 369, dtype: object
In [5]:
df.head()
Out[5]:
Time from Start to Finish (seconds) Q1 Q2 Q3 Q4 Q5 Q6 Q7_Part_1 Q7_Part_2 Q7_Part_3 ... Q38_B_Part_3 Q38_B_Part_4 Q38_B_Part_5 Q38_B_Part_6 Q38_B_Part_7 Q38_B_Part_8 Q38_B_Part_9 Q38_B_Part_10 Q38_B_Part_11 Q38_B_OTHER
0 Duration (in seconds) What is your age (# years)? What is your gender? - Selected Choice In which country do you currently reside? What is the highest level of formal education ... Select the title most similar to your current ... For how many years have you been writing code ... What programming languages do you use on a reg... What programming languages do you use on a reg... What programming languages do you use on a reg... ... In the next 2 years, do you hope to become mor... In the next 2 years, do you hope to become mor... In the next 2 years, do you hope to become mor... In the next 2 years, do you hope to become mor... In the next 2 years, do you hope to become mor... In the next 2 years, do you hope to become mor... In the next 2 years, do you hope to become mor... In the next 2 years, do you hope to become mor... In the next 2 years, do you hope to become mor... In the next 2 years, do you hope to become mor...
1 910 50-54 Man India Bachelor’s degree Other 5-10 years Python R NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 784 50-54 Man Indonesia Master’s degree Program/Project Manager 20+ years NaN NaN SQL ... NaN NaN NaN NaN NaN NaN NaN NaN None NaN
3 924 22-24 Man Pakistan Master’s degree Software Engineer 1-3 years Python NaN NaN ... NaN NaN TensorBoard NaN NaN NaN NaN NaN NaN NaN
4 575 45-49 Man Mexico Doctoral degree Research Scientist 20+ years Python NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN None NaN

5 rows × 369 columns

In [6]:
# Remove the top row
df = df.iloc[1:, :]
In [7]:
# let's take a look at null values
df.isnull().sum() / df.shape[0]
Out[7]:
Time from Start to Finish (seconds)    0.000000
Q1                                     0.000000
Q2                                     0.000000
Q3                                     0.000000
Q4                                     0.000000
                                         ...   
Q38_B_Part_8                           0.960728
Q38_B_Part_9                           0.974358
Q38_B_Part_10                          0.894236
Q38_B_Part_11                          0.825126
Q38_B_OTHER                            0.985485
Length: 369, dtype: float64

I wanna make it easier to run analysis on each question. What if I put all questions in a dictionary along with their answers as a dataframe. This would help me understand each one rather than doing filters each time.

In [8]:
questions = {} 
# Key => question number
# Value => dataframe includes all answers (some has multiple answers or parts)

# let's start with keys
keys = list(dict.fromkeys([i.split('_')[0] for i in df.columns])) # Q1_Part_1
In [9]:
# for instance, each question that starts with (Q4) should be put together in the same df
# note: for Q1 if we use (startswith) questions like Q11,Q12, Q13, ... will be put accedently with Q1
# goes the same for Q2 and Q3

for key in keys:
    if key in ['Q1','Q2', 'Q3']:
        questions[key] = df[key]
    else:
        questions[key] = df[[q for q in df.columns.values if q.startswith(key)]]   
In [10]:
# let's look at kagglers age dist
fig = px.histogram(df, x="Q1")
fig.show()
In [11]:
# look at age VS how many years they've been writing code
# my guiss is older people tend to write more code, let's see
# this time i'll use heatmap
fig = px.density_heatmap(df, x='Q1', y='Q6', category_orders={'Q1': ['18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54', '55-59','60-69','70+'], 'Q6': ['I have never written code', '<1 year', '1-2 years', '3-5 years', '5-10 years', '10-20 years', '20+ years']})
fig.show()
In [12]:
# Q7 look at programming languages
# Rename columns to be (python, R, SQL, ....) instead of (Q7_Part_1, Q7_Part_2,...)
# so basically I will reanme each columns with the most frequesnt value
questions['Q7'].columns = list(questions['Q7'].mode().values)


# transform it to be each skill and the count
q7 = questions['Q7'].count().reset_index()

# rename columns from (level_0, 0) to (language, count)
q7.columns = ['language', 'count']
# sort by count
q7 = q7.sort_values('count', ascending=False)

fig = px.bar(q7, x='language', y='count')
fig.show()
In [13]:
q = questions['Q7'].copy()
q = q.assign(Roles=df['Q5'])
q.columns
Out[13]:
MultiIndex([(    'Python',),
            (         'R',),
            (       'SQL',),
            (         'C',),
            (       'C++',),
            (      'Java',),
            ('Javascript',),
            (     'Julia',),
            (     'Swift',),
            (      'Bash',),
            (    'MATLAB',),
            (      'None',),
            (     'Other',),
            (     'Roles',)],
           )
In [16]:
# I wanna understand everything like (eduaction level, pay, ...) by postion or role (data scientis, analyst,...)
# I will use the same approach to create data snippits for each role
In [14]:
roles = {}

for role in df['Q5'].unique():
    roles[role] = df[df['Q5'] == role]
In [15]:
roles.keys()
Out[15]:
dict_keys(['Other', 'Program/Project Manager', 'Software Engineer', 'Research Scientist', 'Currently not employed', 'Student', 'Data Scientist', 'Data Analyst', 'Machine Learning Engineer', 'Business Analyst', 'Data Engineer', 'Product Manager', 'Statistician', 'Developer Relations/Advocacy', 'DBA/Database Engineer'])
In [19]:
# How does education level (Q4) vary by role
In [16]:
# let's first look at the whole thing
edu = df['Q4'].value_counts()
fig = px.bar(q7, x=edu.index, y=edu)
fig.show()
In [17]:
# what about data scientis
ds_edu = roles['Data Scientist']['Q4'].value_counts()
fig = px.bar(q7, x=ds_edu.index, y=ds_edu)
fig.show()
In [22]:
# wow, most data scientists tend to get a master degree
# but you still can be a data scientist with bachelor's degree. Thank god
In [18]:
# let's make it for some roles
# have googled on how to filter df using dropdown menu and got it in the answer below
#https://stackoverflow.com/questions/59406167/plotly-how-to-filter-a-pandas-dataframe-using-a-dropdown-menu

fig = go.Figure()
fig.update_layout(
    title="Education Level By Role",
    xaxis_title="Education Level",
    yaxis_title="Count",
)

# set up ONE trace

fig.add_trace(go.Bar(x=edu.index, y=edu.values))

buttons = []

for role in roles.keys():
    buttons.append(dict(method='restyle',
                            label=role,
                            visible=True,
                            args=[{'y':[roles[role]['Q4'].value_counts().values],
                                   'x':[roles[role]['Q4'].value_counts().index],
                                   'type':'bar'}, [0]],
                            ))

# # Have no clue what is that and how it works, just wanna something to show
updatemenu = []
your_menu = dict()
updatemenu.append(your_menu)

updatemenu[0]['buttons'] = buttons
updatemenu[0]['direction'] = 'down'
updatemenu[0]['showactive'] = True

# add dropdown menus to the figure
fig.update_layout(showlegend=False, updatemenus=updatemenu)
fig.show()
In [24]:
# makes sense that role like research sientists are most likely to have a doctoral degree
# Data scientists most likely to have a master degree but again a lot don't
In [25]:
# will try to make bars represent percentage instead of row numbers
In [19]:
fig = go.Figure()
fig.update_layout(
    title="Comparing Education Level By Role",
    xaxis_title="Education Level",
    yaxis_title="persent",
)

# set up ONE trace

# let's make y as percent instead of row numbers
fig.add_trace(go.Bar(name='Role', x=edu.index, y=(edu.values/edu.sum())))

buttons = []

for role in roles.keys():
    role_edu = roles[role]['Q4'].value_counts()
    buttons.append(dict(method='restyle',
                            label=role,
                            visible=True,
                            args=[{'y':[role_edu.values/(role_edu.sum())],
                                   'x':[role_edu.index],
                                   'type':'bar'}, [0]],
                            ))

# # Have no clue what is that and how it works, just wanna something to show
updatemenu = []
your_menu = dict()
updatemenu.append(your_menu)

updatemenu[0]['buttons'] = buttons
updatemenu[0]['direction'] = 'down'
updatemenu[0]['showactive'] = True

# add dropdown menus to the figure
fig.update_layout(showlegend=False, updatemenus=updatemenu)
fig.show()
In [27]:
# it turns out the around 50% of data scientists have master degree
In [28]:
# Let's try to compare edu by roles in the same graph
# let's create two dropdowns
In [20]:
fig = go.Figure()
fig.update_layout(
    title="Comparing Education Level By Role",
    xaxis_title="Education Level",
    yaxis_title="persent",
)

# set up ONE trace

# let's make y as percent instead of row numbers
fig.add_trace(go.Bar(name="1st selection", x=edu.index, y=(edu.values/edu.sum())))




# it figures out we can create two button lists
buttons1 = []
buttons2 = []

for role in roles.keys():
    role_edu = roles[role]['Q4'].value_counts()
    buttons1.append(dict(method='restyle',
                            label=role,
                            visible=True,
                            args=[{'y':[role_edu.values/(role_edu.sum())],
                                   'x':[role_edu.index],
                                   'type':'bar'}, [0]],
                            ))
    
# it figures out we can add one more trace and buttons to display two roles at a time
fig.add_trace(go.Bar(name='2nd selection', x=edu.index, y=(edu.values/edu.sum())))   

buttons2 = []

for role in roles.keys():
    role_edu = roles[role]['Q4'].value_counts()
    buttons2.append(dict(method='restyle',
                            label=role,
                            visible=True,
                            args=[{'y':[role_edu.values/(role_edu.sum())],
                                   'x':[role_edu.index],
                                   'type':'bar'}, [1]], # just played around this index (0,1) and it figures
                                                         # 0 for 1st trace and 1 for the second, not sure but it works
                            ))
    

# I've got this now, we can use it to customize each button
#https://plotly.com/python/reference/layout/updatemenus/
updatemenu = [
    dict(buttons=buttons1,
         direction='down',
         pad={'r': 10, 't': 10, 'b': 10},
         showactive=True,
         x=0.1,
         xanchor='left',
         y=1.23,
         yanchor='top'
        ),
    dict(buttons=buttons2,
         direction='down',
         pad={'r': 10, 't': 10},
         showactive=True,
         x=0.5,
         xanchor='left',
         y=1.23,
         yanchor='top'
        )
]

# add dropdown menus to the figure
fig.update_layout(updatemenus=updatemenu)
fig.update_layout(annotations = [
    dict(text = '1st selection', x = 0, xref='paper', y=1.15, yref='paper', align='left', showarrow=False),
    dict(text = '2nd selection', x = 0.45, xref='paper', y=1.15, yref='paper', align='right', showarrow=False)

])
fig.show()
In [30]:
# let's look at what programming language kagglers recommend by role
In [21]:
recommended_languages = df['Q8'].value_counts()
In [22]:
fig = go.Figure()
fig.update_layout(
    title="Recommended languages By Role",
    xaxis_title="Programming Languages",
    yaxis_title="persent",
)

# set up ONE trace

# let's make y as percent instead of row numbers
fig.add_trace(go.Bar(name="1st selection", x=recommended_languages.index, y=(recommended_languages.values/recommended_languages.values.sum())))




# it figures out we can create two button lists
buttons1 = []

buttons1.append(dict(method='restyle',
                            label='All data',
                            visible=True,
                            args=[{'y':[recommended_languages.values/(recommended_languages.sum())],
                                   'x':[recommended_languages.index],
                                   'type':'bar'}, [0]],
                            ))

for role in roles.keys():
    recommended_languages_byrole = roles[role]['Q8'].value_counts()
    buttons1.append(dict(method='restyle',
                            label=role,
                            visible=True,
                            args=[{'y':[recommended_languages_byrole.values/(recommended_languages_byrole.sum())],
                                   'x':[recommended_languages_byrole.index],
                                   'type':'bar'}, [0]],
                            ))
    
# it figures out we can add one more trace and buttons to display two roles at a time
fig.add_trace(go.Bar(name='2nd selection', x=recommended_languages.index, y=(recommended_languages.values/recommended_languages.values.sum())))   

buttons2 = []

buttons2.append(dict(method='restyle',
                            label='All data',
                            visible=True,
                            args=[{'y':[recommended_languages.values/(recommended_languages.sum())],
                                   'x':[recommended_languages.index],
                                   'type':'bar'}, [1]],
                            ))

for role in roles.keys():
    recommended_languages_byrole = roles[role]['Q8'].value_counts()
    buttons2.append(dict(method='restyle',
                            label=role,
                            visible=True,
                            args=[{'y':[recommended_languages_byrole.values/(recommended_languages_byrole.sum())],
                                   'x':[recommended_languages_byrole.index],
                                   'type':'bar'}, [1]], # just played around this index (0,1) and it figures
                                                         # 0 for 1st trace and 1 for the second, not sure but it works
                            ))
    

# I've got this now, we can use it to customize each button
#https://plotly.com/python/reference/layout/updatemenus/
updatemenu = [
    dict(buttons=buttons1,
         direction='down',
         pad={'r': 10, 't': 10, 'b': 10},
         showactive=True,
         x=0.1,
         xanchor='left',
         y=1.15,
         yanchor='top'
        ),
    dict(buttons=buttons2,
         direction='down',
         pad={'r': 10, 't': 10},
         showactive=True,
         x=0.5,
         xanchor='left',
         y=1.15,
         yanchor='top'
        )
]

# add dropdown menus to the figure
fig.update_layout(updatemenus=updatemenu)
fig.update_layout(annotations = [
    dict(text = '1st selection', x = 0, xref='paper', y=1.15, yref='paper', align='left', showarrow=False),
    dict(text = '2nd selection', x = 0.45, xref='paper', y=1.15, yref='paper', align='right', showarrow=False)

])
fig.show()
In [33]:
# wow. About 80% recommedned Python
# evnen statisticians who use R most commonly, still recommend python though
# if you coming in and trying to understand what should i learn first, pyhton is the number 1 thing
In [34]:
# so good so far, but I wanna make a kind function that can help us generate those graphs
In [23]:
def filter_bars(role, data):
    df = data[data['Roles'] == role]
    q = df.drop('Roles', axis= 1).count().reset_index()
    q.columns = ['feature','Count']
    return (q.feature, q.Count/q.Count.sum())

def build_graph(question, Roles, Title):
    """
        first case: questions that have multiple columns
        I'll handle it as the same as Q7 above
    """
    if isinstance(question, pd.DataFrame):
        q_copy = question.copy()
        q_copy.columns = list(q_copy.mode().iloc[0,:])
        q_feature_count = q_copy.count().reset_index()
        q_feature_count.columns = ['feature','Count']
        q_feature_count = q_feature_count.sort_values('Count', ascending = False)
        q_copy['Roles'] = df.Q5
        
        fig = go.Figure(layout=go.Layout(title= go.layout.Title(text=Title)))
        #changed from role selection to selection 1
        fig.add_trace(go.Bar(name= 'Selection 1', x= q_feature_count.feature, y=(q_feature_count.Count/ q_feature_count.Count.sum())))

        buttons = []
        #added button for all data comparison
        buttons.append(dict(method='restyle',
                                label= 'All Samples',
                                visible=True,
                                args=[{'y':[(q_feature_count.Count/ q_feature_count.Count.sum())],
                                       'x':[q_feature_count.feature],
                                       'type':'bar'}, [0]], # the [0] at the end lets us know they are for the first trace
                                )
                          )
        
        for i in list(Roles.keys())[1:]:
            buttons.append(dict(method='restyle',
                                label= i,
                                visible=True,
                                args=[{'y':[filter_bars(i,q_copy)[1].values],
                                       'x':[filter_bars(i,q_copy)[0].values],
                                       'type':'bar'}, [0]], # the [0] at the end lets us know they are for the first trace
                                )
                          )
            
        fig.add_trace(go.Bar(name= 'Selection 2', x= q_feature_count.feature, y=(q_feature_count.Count/ q_feature_count.Count.sum())))

        buttons_2 = []
        #added button for all data comparison
        buttons_2.append(dict(method='restyle',
                                label= 'All Samples',
                                visible=True,
                                args=[{'y':[(q_feature_count.Count/ q_feature_count.Count.sum())],
                                       'x':[q_feature_count.feature],
                                       'type':'bar'}, [1]], # the [0] at the end lets us know they are for the first trace
                                )
                          )
        
        for i in list(Roles.keys())[1:]:
            buttons_2.append(dict(method='restyle',
                                label= i,
                                visible=True,
                                args=[{'y':[filter_bars(i,q_copy)[1].values],
                                       'x':[filter_bars(i,q_copy)[0].values],
                                       'type':'bar'}, [1]], # the [0] at the end lets us know they are for the first trace
                                )
                          )    
            
        button_layer_1_height = 1.15    
        updatemenus = list([
            dict(buttons=buttons,
                    direction="down",
                    pad={"r": 10, "t": 10},
                    showactive=True,
                    x=0.1,
                    xanchor="left",
                    y=button_layer_1_height,
                    yanchor="top"),
            dict(buttons=buttons_2,
                    direction="down",
                    pad={"r": 10, "t": 10},
                    showactive=True,
                    x=0.50,
                    xanchor="left",
                    y=button_layer_1_height,
                    yanchor="top")
        ])
        
        fig.update_layout( updatemenus=updatemenus)
        #added annotations next to dropdowns 
        fig.update_layout(
            annotations=[
                dict(text="Selection 1", x=0, xref="paper", y=1.1, yref="paper",
                                     align="left", showarrow=False),
                dict(text="Selection 2", x=0.45, xref="paper", y=1.1,
                                     yref="paper", showarrow=False)
            ])
        fig.update_xaxes(categoryorder= 'array', categoryarray= q_feature_count.feature)
        fig.show()
        
    else:
        print('else')
        """
            2nd case is questions that have only one column like Q2 and Q8 above
        """
        qnumber= question.copy()
        vcnts = qnumber.value_counts()
        qnumber = pd.concat([qnumber,df.Q5], axis =1)
        qnumber.columns = ['feature','Roles']
        
        fig = go.Figure(layout=go.Layout(title= go.layout.Title(text=Title)))
        #changed from role selection to selection 1
        fig.add_trace(go.Bar(name= 'Selection 1', x= vcnts.index, y=(vcnts.values/ vcnts.values.sum())))

        buttons = []

        #added button for all data comparison
        buttons.append(dict(method='restyle',
                                label= 'All Samples',
                                visible=True,
                                args=[{'y':[vcnts.values/ vcnts.values.sum()],
                                       'x':[vcnts.index],
                                       'type':'bar'}, [0]], # the [0] at the end lets us know they are for the first trace
                                )
                          )
        
        for i in list(Roles.keys())[1:]:
            qrole = qnumber[qnumber['Roles']==i].feature.value_counts()
            buttons.append(dict(method='restyle',
                                label= i,
                                visible=True,
                                args=[{'y':[qrole.values/qrole.values.sum()],
                                       'x':[qrole.index],
                                       'type':'bar'}, [0]], # the [0] at the end lets us know they are for the first trace
                                )
                          )
            
        fig.add_trace(go.Bar(name= 'Selection 1', x= vcnts.index, y=(vcnts.values/ vcnts.values.sum())))

        buttons_2 = []

        #added button for all data comparison
        buttons_2.append(dict(method='restyle',
                                label= 'All Samples',
                                visible=True,
                                args=[{'y':[vcnts.values/ vcnts.values.sum()],
                                       'x':[vcnts.index],
                                       'type':'bar'}, [1]], # the [0] at the end lets us know they are for the first trace
                                )
                          )
        
        for i in list(Roles.keys())[1:]:
            qrole = qnumber[qnumber['Roles']==i].feature.value_counts()
            buttons_2.append(dict(method='restyle',
                                label= i,
                                visible=True,
                                args=[{'y':[qrole.values/qrole.values.sum()],
                                       'x':[qrole.index],
                                       'type':'bar'}, [1]], # the [0] at the end lets us know they are for the first trace
                                )
                          )
        button_layer_1_height = 1.15    
        updatemenus = list([
            dict(buttons=buttons,
                    direction="down",
                    pad={"r": 10, "t": 10},
                    showactive=True,
                    x=0.1,
                    xanchor="left",
                    y=button_layer_1_height,
                    yanchor="top"),
            dict(buttons=buttons_2,
                    direction="down",
                    pad={"r": 10, "t": 10},
                    showactive=True,
                    x=0.50,
                    xanchor="left",
                    y=button_layer_1_height,
                    yanchor="top")
        ])
        
        fig.update_layout( updatemenus=updatemenus)
        #added annotations next to dropdowns 
        fig.update_layout(
            annotations=[
                dict(text="Selection 1", x=0, xref="paper", y=1.1, yref="paper",
                                     align="left", showarrow=False),
                dict(text="Selection 2", x=0.45, xref="paper", y=1.1,
                                     yref="paper", showarrow=False)
            ])
        fig.update_xaxes(categoryorder= 'array', categoryarray= vcnts.index)
        fig.show()
        
    return
In [36]:
# maybe it's not the most elegant way to do it but that's what i've come up with. I might revist it later
In [24]:
build_graph(questions['Q7'], roles, 'Popular Programming Language')
In [38]:
# Python, SQL, and R are the most popular ones
# All people tend to use pyhton much more than R except for Statisticians :)
# seems like analysts are using SQL a bit more than data scientists
In [25]:
build_graph(df['Q8'], roles, 'Recommended language')
else
In [26]:
build_graph(questions['Q9'], roles, 'IDE')
In [27]:
build_graph(questions['Q14'], roles, 'most common data viz library')
In [28]:
build_graph(questions['Q31'], roles, 'most common PI softwares')
In [29]:
build_graph(questions['Q38'], roles, 'Primary tool used in analytics')